In [25]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score,auc,  precision_recall_curve
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly import tools
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import xgboost as xgb
In [27]:
# NOTE(review): absolute local path — breaks on any other machine; consider a
# configurable DATA_DIR (pathlib.Path) relative to the project root.
DATA_PATH = "/Users/motlegoland/Desktop/data analyst/projects/project portfolio/fraud detection/data/creditcard_csv.csv"
df = pd.read_csv(DATA_PATH)

Credit Fraud Detection Project¶

data exploration¶

In [29]:
# Preview the full frame (pandas truncates the display to head/tail rows).
df
Out[29]:
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 ... V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
0 0.0 -1.359807 -0.072781 2.536347 1.378155 -0.338321 0.462388 0.239599 0.098698 0.363787 ... -0.018307 0.277838 -0.110474 0.066928 0.128539 -0.189115 0.133558 -0.021053 149.62 '0'
1 0.0 1.191857 0.266151 0.166480 0.448154 0.060018 -0.082361 -0.078803 0.085102 -0.255425 ... -0.225775 -0.638672 0.101288 -0.339846 0.167170 0.125895 -0.008983 0.014724 2.69 '0'
2 1.0 -1.358354 -1.340163 1.773209 0.379780 -0.503198 1.800499 0.791461 0.247676 -1.514654 ... 0.247998 0.771679 0.909412 -0.689281 -0.327642 -0.139097 -0.055353 -0.059752 378.66 '0'
3 1.0 -0.966272 -0.185226 1.792993 -0.863291 -0.010309 1.247203 0.237609 0.377436 -1.387024 ... -0.108300 0.005274 -0.190321 -1.175575 0.647376 -0.221929 0.062723 0.061458 123.50 '0'
4 2.0 -1.158233 0.877737 1.548718 0.403034 -0.407193 0.095921 0.592941 -0.270533 0.817739 ... -0.009431 0.798278 -0.137458 0.141267 -0.206010 0.502292 0.219422 0.215153 69.99 '0'
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
284802 172786.0 -11.881118 10.071785 -9.834783 -2.066656 -5.364473 -2.606837 -4.918215 7.305334 1.914428 ... 0.213454 0.111864 1.014480 -0.509348 1.436807 0.250034 0.943651 0.823731 0.77 '0'
284803 172787.0 -0.732789 -0.055080 2.035030 -0.738589 0.868229 1.058415 0.024330 0.294869 0.584800 ... 0.214205 0.924384 0.012463 -1.016226 -0.606624 -0.395255 0.068472 -0.053527 24.79 '0'
284804 172788.0 1.919565 -0.301254 -3.249640 -0.557828 2.630515 3.031260 -0.296827 0.708417 0.432454 ... 0.232045 0.578229 -0.037501 0.640134 0.265745 -0.087371 0.004455 -0.026561 67.88 '0'
284805 172788.0 -0.240440 0.530483 0.702510 0.689799 -0.377961 0.623708 -0.686180 0.679145 0.392087 ... 0.265245 0.800049 -0.163298 0.123205 -0.569159 0.546668 0.108821 0.104533 10.00 '0'
284806 172792.0 -0.533413 -0.189733 0.703337 -0.506271 -0.012546 -0.649617 1.577006 -0.414650 0.486180 ... 0.261057 0.643078 0.376777 0.008797 -0.473649 -0.818267 -0.002415 0.013649 217.00 '0'

284807 rows × 31 columns

In [32]:
# Dtypes and non-null counts; note 'Class' arrives as object (quoted strings).
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     284807 non-null  float64
 22  V22     284807 non-null  float64
 23  V23     284807 non-null  float64
 24  V24     284807 non-null  float64
 25  V25     284807 non-null  float64
 26  V26     284807 non-null  float64
 27  V27     284807 non-null  float64
 28  V28     284807 non-null  float64
 29  Amount  284807 non-null  float64
 30  Class   284807 non-null  object 
dtypes: float64(30), object(1)
memory usage: 67.4+ MB

No null values were detected. Next we convert the 'Class' series to an integer and identify the proportion of fraudulent transactions compared to non-fraudulent ones.

In [35]:
# Labels arrive as quoted strings ("'0'" / "'1'"); strip the quotes and
# store them in a compact integer dtype.
stripped_labels = df['Class'].str.strip("'")
df['Class'] = stripped_labels.astype('int8')
In [37]:
# Class balance: fraction of legitimate (0) vs. fraudulent (1) transactions.
df['Class'].value_counts(normalize=True)
Out[37]:
Class
0    0.998273
1    0.001727
Name: proportion, dtype: float64

The very low proportion of fraud cases (about 0.17%) means the dataset is highly imbalanced.

In [40]:
# Summary statistics; the V* columns are PCA components centred near 0
# (see the scaling note further below), Amount and Time are raw.
df.describe().round(2)
Out[40]:
Time V1 V2 V3 V4 V5 V6 V7 V8 V9 ... V21 V22 V23 V24 V25 V26 V27 V28 Amount Class
count 284807.00 284807.00 284807.00 284807.00 284807.00 284807.00 284807.00 284807.00 284807.00 284807.00 ... 284807.00 284807.00 284807.00 284807.00 284807.00 284807.00 284807.00 284807.00 284807.00 284807.00
mean 94813.86 0.00 0.00 -0.00 0.00 0.00 0.00 -0.00 0.00 -0.00 ... 0.00 -0.00 0.00 0.00 0.00 0.00 -0.00 -0.00 88.35 0.00
std 47488.15 1.96 1.65 1.52 1.42 1.38 1.33 1.24 1.19 1.10 ... 0.73 0.73 0.62 0.61 0.52 0.48 0.40 0.33 250.12 0.04
min 0.00 -56.41 -72.72 -48.33 -5.68 -113.74 -26.16 -43.56 -73.22 -13.43 ... -34.83 -10.93 -44.81 -2.84 -10.30 -2.60 -22.57 -15.43 0.00 0.00
25% 54201.50 -0.92 -0.60 -0.89 -0.85 -0.69 -0.77 -0.55 -0.21 -0.64 ... -0.23 -0.54 -0.16 -0.35 -0.32 -0.33 -0.07 -0.05 5.60 0.00
50% 84692.00 0.02 0.07 0.18 -0.02 -0.05 -0.27 0.04 0.02 -0.05 ... -0.03 0.01 -0.01 0.04 0.02 -0.05 0.00 0.01 22.00 0.00
75% 139320.50 1.32 0.80 1.03 0.74 0.61 0.40 0.57 0.33 0.60 ... 0.19 0.53 0.15 0.44 0.35 0.24 0.09 0.08 77.16 0.00
max 172792.00 2.45 22.06 9.38 16.88 34.80 73.30 120.59 20.01 15.59 ... 27.20 10.50 22.53 4.58 7.52 3.52 31.61 33.85 25691.16 1.00

8 rows × 31 columns

In [44]:
# Transaction-time densities for the two classes.
time_not_fraud = df.loc[df['Class'] == 0]["Time"]
time_fraud = df.loc[df['Class'] == 1]["Time"]

fig = ff.create_distplot(
    [time_not_fraud, time_fraud],
    ['Not Fraud', 'Fraud'],
    show_hist=False,
    show_rug=False,
)
fig['layout'].update(title='Credit Card Transactions Time Density Plot', xaxis=dict(title='Time [s]'))
iplot(fig, filename='dist_only')

The dips in regular transactions are probably due to low activity during the night in Europe; the fraudulent transactions are distributed more evenly over time.

In [93]:
# Pairwise correlations — the PCA components are orthogonal by construction,
# so the interesting row/column is 'Class'.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(df.corr(), cmap="coolwarm", annot=False, ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()
No description has been provided for this image

Visualizing the correlations between the variables, we notice that V2, V4, V11, V12, V14 and V17 have a noticeable correlation with 'Class'.

2. logistic regression¶

We will establish a logistic regression model as a baseline.
SMOTE is used to handle the class imbalance (applied to the training set only).

In [98]:
# Features / target split. A stratified 80/20 hold-out keeps the ~0.17%
# fraud rate identical in train and test.
X = df.drop('Class', axis=1)
y = df['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Oversample the minority class on the TRAINING fold only — applying SMOTE
# before the split (or to the test set) would leak synthetic samples.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

Scaling the variables is necessary for gradient-based algorithms such as logistic regression, even on this PCA-transformed dataset.

In [101]:
# Fit the scaler on the (resampled) training data only, then apply the same
# transform to the test set — avoids test-set leakage.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_resampled)
X_test_scaled = scaler.transform(X_test)
In [103]:
#train the model
# max_iter raised to 1000 to ensure the solver converges.
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_scaled, y_train_resampled)

# Hard labels for the report / confusion matrix; fraud-class probabilities
# for the threshold-free curves (ROC / PR) below.
y_pred = log_reg.predict(X_test_scaled)
y_pred_prob = log_reg.predict_proba(X_test_scaled)[:, 1]
In [105]:
# Baseline evaluation on the untouched (imbalanced) test set.
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# PR-AUC is the more informative summary under heavy class imbalance;
# precision/recall/pr_auc are reused by the plotting cell below.
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob)
pr_auc = auc(recall, precision)
print("Precision-Recall AUC:", pr_auc)

roc_auc = roc_auc_score(y_test, y_pred_prob)
print("ROC-AUC Score:", roc_auc)
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      0.99     56864
           1       0.13      0.90      0.23        98

    accuracy                           0.99     56962
   macro avg       0.57      0.94      0.61     56962
weighted avg       1.00      0.99      0.99     56962

Confusion Matrix:
 [[56299   565]
 [   10    88]]
Precision-Recall AUC: 0.7776995057198918
ROC-AUC Score: 0.9772273516187566
In [107]:
# Precision-Recall Curve for the logistic-regression baseline.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision, label=f'PR AUC = {pr_auc:.2f}')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve')
ax.legend()
plt.show()
No description has been provided for this image

The logistic regression shows good recall but poor precision. The precision-recall curve shows that the best balanced operating point is around 0.8 for both precision and recall.
Recall is especially important for fraud detection models because of the high cost of missing fraud cases.
Next we will compare this model with a random forest classifier.

3. Random forest classifier¶

we will use functions in order to train and evaluate the model to make future adjustments easier.

In [112]:
#base model
def train_random_forest(X_train, y_train, n_estimators=200, max_depth=None):
    """Fit a class-weighted random forest on the (imbalanced) training data.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    n_estimators : number of trees (default 200, see note below).
    max_depth : maximum tree depth; None grows trees until pure.

    Returns the fitted RandomForestClassifier.
    """
    # class_weight='balanced' reweights classes inversely to frequency,
    # offsetting the tiny fraud proportion without resampling.
    # n_jobs=-1 parallelises tree building; with a fixed random_state the
    # fitted model is identical to the single-threaded run.
    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1,
    )
    # random forest isn't prone to overfitting, but 200 estimators is inside
    # the norm for these projects and it's as high as my pc will go
    rf.fit(X_train, y_train)
    return rf

rf = train_random_forest(X_train, y_train)
In [113]:
#evaluation
def evaluate_model(model, X_test, y_test):
    """Print classification report, AUC-ROC and confusion matrix; return
    the fraud-class probabilities for later threshold analysis."""
    predictions = model.predict(X_test)
    fraud_scores = model.predict_proba(X_test)[:, 1]
    print(classification_report(y_test, predictions))
    print("AUC-ROC:", roc_auc_score(y_test, fraud_scores))
    print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))
    return fraud_scores

y_prob = evaluate_model(rf, X_test, y_test)
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.96      0.76      0.85        98

    accuracy                           1.00     56962
   macro avg       0.98      0.88      0.92     56962
weighted avg       1.00      1.00      1.00     56962

AUC-ROC: 0.9571890288895525
Confusion Matrix:
 [[56861     3]
 [   24    74]]
In [114]:
# Mean transaction amount over fraud cases — a proxy for the cost of a
# single missed fraud (false negative).
average_fraud_amount = df.loc[df['Class'] == 1, 'Amount'].mean()
average_fraud_amount
Out[114]:
122.21132113821139

We won't adjust the class weights in the model because of the lack of information about the cost of false positives.
The cost of a false negative can be estimated (from the average fraud amount above), so if we receive information about the false-positive cost it would be straightforward to tune the model.

In [120]:
def plot_precision_recall_curve(y_test, y_prob):
    """Plot the precision-recall trade-off with a fine tick grid so
    candidate operating points can be read off the curve."""
    prec, rec, _thresholds = precision_recall_curve(y_test, y_prob)
    fig, ax = plt.subplots()
    ax.plot(rec, prec, marker='.')
    ax.set_title('Precision-Recall Curve')
    ax.set_xlabel('Recall')
    ax.set_ylabel('Precision')
    ax.grid(which='both', alpha=1)
    ax.set_xticks(np.arange(0, 1.1, 0.1))
    ax.set_yticks(np.arange(0, 1.1, 0.05))
    plt.show()

plot_precision_recall_curve(y_test, y_prob)
No description has been provided for this image

As the graph shows, the random forest model achieves a better balance between precision and recall.
We will add a function for threshold adjustment and look for the optimal operating point.

In [124]:
# Threshold Adjustment
def adjust_threshold(model, X_test, y_test, threshold=0.5):
    """Classify with a custom probability cut-off (default 0.5, i.e. the
    model's own decision rule) and print the resulting metrics."""
    fraud_scores = model.predict_proba(X_test)[:, 1]
    # Flag a transaction as fraud whenever its score clears the cut-off.
    flagged = (fraud_scores >= threshold).astype(int)
    print(f"Classification Report at Threshold {threshold}:")
    print(classification_report(y_test, flagged))
    print("Confusion Matrix:\n", confusion_matrix(y_test, flagged))
    return flagged

y_pred_threshold = adjust_threshold(rf, X_test, y_test, threshold=0.5)
Classification Report at Threshold 0.5:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.96      0.76      0.85        98

    accuracy                           1.00     56962
   macro avg       0.98      0.88      0.92     56962
weighted avg       1.00      1.00      1.00     56962

Confusion Matrix:
 [[56861     3]
 [   24    74]]

Before we start tuning, we check the precision-recall curve to understand the potential of threshold adjustment.

We can keep lowering the threshold until recall reaches about 0.85.

In [128]:
# Threshold 0.45: one more fraud caught (75 vs 74 TP), false positives unchanged.
y_pred_threshold = adjust_threshold(rf, X_test, y_test, threshold=0.45)
Classification Report at Threshold 0.45:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.96      0.77      0.85        98

    accuracy                           1.00     56962
   macro avg       0.98      0.88      0.93     56962
weighted avg       1.00      1.00      1.00     56962

Confusion Matrix:
 [[56861     3]
 [   23    75]]
In [130]:
# Threshold 0.40: recall rises to 0.80 at the cost of one extra false positive.
y_pred_threshold = adjust_threshold(rf, X_test, y_test, threshold=0.4)
Classification Report at Threshold 0.4:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.95      0.80      0.87        98

    accuracy                           1.00     56962
   macro avg       0.98      0.90      0.93     56962
weighted avg       1.00      1.00      1.00     56962

Confusion Matrix:
 [[56860     4]
 [   20    78]]
In [132]:
# Threshold 0.35: recall 0.82, precision still 0.95.
y_pred_threshold = adjust_threshold(rf, X_test, y_test, threshold=0.35)
Classification Report at Threshold 0.35:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.95      0.82      0.88        98

    accuracy                           1.00     56962
   macro avg       0.98      0.91      0.94     56962
weighted avg       1.00      1.00      1.00     56962

Confusion Matrix:
 [[56860     4]
 [   18    80]]
In [134]:
# Threshold 0.30: recall 0.84; precision starts to slip (0.93).
y_pred_threshold = adjust_threshold(rf, X_test, y_test, threshold=0.3)
Classification Report at Threshold 0.3:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.93      0.84      0.88        98

    accuracy                           1.00     56962
   macro avg       0.97      0.92      0.94     56962
weighted avg       1.00      1.00      1.00     56962

Confusion Matrix:
 [[56858     6]
 [   16    82]]
In [136]:
# Threshold 0.25: recall 0.85 reached, but false positives jump from 6 to 11.
y_pred_threshold = adjust_threshold(rf, X_test, y_test, threshold=0.25)
Classification Report at Threshold 0.25:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.88      0.85      0.86        98

    accuracy                           1.00     56962
   macro avg       0.94      0.92      0.93     56962
weighted avg       1.00      1.00      1.00     56962

Confusion Matrix:
 [[56853    11]
 [   15    83]]
In [138]:
# Threshold 0.20: no further recall gain and precision keeps falling — stop here.
y_pred_threshold = adjust_threshold(rf, X_test, y_test, threshold=0.2)
Classification Report at Threshold 0.2:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.86      0.85      0.86        98

    accuracy                           1.00     56962
   macro avg       0.93      0.92      0.93     56962
weighted avg       1.00      1.00      1.00     56962

Confusion Matrix:
 [[56851    13]
 [   15    83]]

With a threshold of 0.30 we reach an f1-score of 0.88 and recall of 0.84 with only a small reduction in precision; lowering the threshold further (0.25, 0.20) buys little extra recall while precision keeps dropping.

Conclusions¶

  1. The threshold-adjusted random forest model is recommended for deployment, given its ability to raise recall while keeping precision high. This ensures effective fraud detection, minimizing financial loss, while the low false-alarm rate preserves user experience.
  2. Future improvements: add feature-importance analysis and time-based patterns to further enhance the model, and experiment with XGBoost to see if performance can be improved further.
In [ ]: